import zipfile
import os
import random

# Input archive and the two output files of the 70/30 split.
zip_file_path = './datasets/chemblv31/train.zip'
train_70_path = './datasets/chemblv31/train_70.txt'
train_30_path = './datasets/chemblv31/train_30.txt'

# The archive is assumed to contain a top-level file named train.txt;
# open() in main() raises FileNotFoundError if that assumption is wrong.
train_file_path = './datasets/chemblv31/train.txt'


def ensure_trailing_newline(lines):
    """Return a copy of *lines* whose last element ends with a newline.

    ``readlines()`` keeps line terminators, but the final line of a file may
    have none. Once the lines are shuffled that line can land in the middle,
    and ``writelines()`` (which adds no separators) would glue it to the
    following record. Terminating it up front keeps every record on its own
    line regardless of where it ends up.

    Args:
        lines: List of strings as returned by ``readlines()``.

    Returns:
        A new list; the input list is not modified.
    """
    fixed = list(lines)
    if fixed and not fixed[-1].endswith('\n'):
        fixed[-1] += '\n'
    return fixed


def split_lines(lines, fraction=0.7):
    """Shuffle *lines* and split them into two parts.

    The shuffle uses the module-level ``random`` generator, so the result is
    only reproducible if the caller seeds it beforehand.

    Args:
        lines: Sequence of records to split. It is not modified.
        fraction: Share of the records that goes into the first part; the
            count is truncated with ``int()``, so the first part gets
            ``int(len(lines) * fraction)`` records.

    Returns:
        A ``(first, second)`` tuple of lists that together contain every
        input record exactly once.

    Raises:
        ValueError: If *fraction* is outside the range [0, 1].
    """
    if not 0 <= fraction <= 1:
        raise ValueError(f"fraction must be between 0 and 1, got {fraction!r}")

    shuffled = list(lines)
    random.shuffle(shuffled)

    cut = int(len(shuffled) * fraction)
    return shuffled[:cut], shuffled[cut:]


def main():
    """Extract the archive, then write a shuffled 70/30 split of train.txt."""
    # Unpack the ZIP archive next to itself.
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall('./datasets/chemblv31')

    # Load every training record into memory.
    with open(train_file_path, 'r', encoding='utf-8') as f:
        lines = ensure_trailing_newline(f.readlines())

    # Shuffle and cut at 70%.
    train_70, train_30 = split_lines(lines, 0.7)

    # Persist both parts.
    with open(train_70_path, 'w', encoding='utf-8') as f:
        f.writelines(train_70)

    with open(train_30_path, 'w', encoding='utf-8') as f:
        f.writelines(train_30)

    print(f"训练数据已成功拆分：70%数据写入{train_70_path}，30%数据写入{train_30_path}")


if __name__ == "__main__":
    main()
